# import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import dalex as dx  # model-explanation toolkit (PDP/ALE profiles used below)
import warnings
warnings.filterwarnings('ignore')  # silence library warnings for cleaner notebook output
import plotly
plotly.offline.init_notebook_mode()  # enable inline plotly rendering in the notebook
pd.set_option('display.max_columns', None)  # show every column when displaying DataFrames
df = pd.read_csv('hotel_bookings.csv')  # hotel booking demand dataset (one row per booking)
df.head()  # preview the first five rows
| hotel | is_canceled | lead_time | arrival_date_year | arrival_date_month | arrival_date_week_number | arrival_date_day_of_month | stays_in_weekend_nights | stays_in_week_nights | adults | children | babies | meal | country | market_segment | distribution_channel | is_repeated_guest | previous_cancellations | previous_bookings_not_canceled | reserved_room_type | assigned_room_type | booking_changes | deposit_type | agent | company | days_in_waiting_list | customer_type | adr | required_car_parking_spaces | total_of_special_requests | reservation_status | reservation_status_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Resort Hotel | 0 | 342 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 3 | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 1 | Resort Hotel | 0 | 737 | 2015 | July | 27 | 1 | 0 | 0 | 2 | 0.0 | 0 | BB | PRT | Direct | Direct | 0 | 0 | 0 | C | C | 4 | No Deposit | NaN | NaN | 0 | Transient | 0.0 | 0 | 0 | Check-Out | 2015-07-01 |
| 2 | Resort Hotel | 0 | 7 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Direct | Direct | 0 | 0 | 0 | A | C | 0 | No Deposit | NaN | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 3 | Resort Hotel | 0 | 13 | 2015 | July | 27 | 1 | 0 | 1 | 1 | 0.0 | 0 | BB | GBR | Corporate | Corporate | 0 | 0 | 0 | A | A | 0 | No Deposit | 304.0 | NaN | 0 | Transient | 75.0 | 0 | 0 | Check-Out | 2015-07-02 |
| 4 | Resort Hotel | 0 | 14 | 2015 | July | 27 | 1 | 0 | 2 | 2 | 0.0 | 0 | BB | GBR | Online TA | TA/TO | 0 | 0 | 0 | A | A | 0 | No Deposit | 240.0 | NaN | 0 | Transient | 98.0 | 0 | 1 | Check-Out | 2015-07-03 |
df.shape  # (rows, columns) of the raw dataset
(119390, 32)
df.info()  # dtypes and non-null counts; children, country, agent, company contain NaNs
<class 'pandas.core.frame.DataFrame'> RangeIndex: 119390 entries, 0 to 119389 Data columns (total 32 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 hotel 119390 non-null object 1 is_canceled 119390 non-null int64 2 lead_time 119390 non-null int64 3 arrival_date_year 119390 non-null int64 4 arrival_date_month 119390 non-null object 5 arrival_date_week_number 119390 non-null int64 6 arrival_date_day_of_month 119390 non-null int64 7 stays_in_weekend_nights 119390 non-null int64 8 stays_in_week_nights 119390 non-null int64 9 adults 119390 non-null int64 10 children 119386 non-null float64 11 babies 119390 non-null int64 12 meal 119390 non-null object 13 country 118902 non-null object 14 market_segment 119390 non-null object 15 distribution_channel 119390 non-null object 16 is_repeated_guest 119390 non-null int64 17 previous_cancellations 119390 non-null int64 18 previous_bookings_not_canceled 119390 non-null int64 19 reserved_room_type 119390 non-null object 20 assigned_room_type 119390 non-null object 21 booking_changes 119390 non-null int64 22 deposit_type 119390 non-null object 23 agent 103050 non-null float64 24 company 6797 non-null float64 25 days_in_waiting_list 119390 non-null int64 26 customer_type 119390 non-null object 27 adr 119390 non-null float64 28 required_car_parking_spaces 119390 non-null int64 29 total_of_special_requests 119390 non-null int64 30 reservation_status 119390 non-null object 31 reservation_status_date 119390 non-null object dtypes: float64(4), int64(16), object(12) memory usage: 29.1+ MB
# Replace missing values:
# agent: if no agency is given, the booking was most likely made without one.
# company: if none is given, the booking was most likely private.
# The rest should be self-explanatory.
# FIX: the original dict used the key "children:" (trailing colon), so the
# missing values in the "children" column were never actually filled.
nan_replacements = {"children": 0.0, "country": "Unknown", "agent": 0, "company": 0}
df = df.fillna(nan_replacements)
# "meal" contains the value "Undefined", which is equivalent to SC.
# FIX: assign the result instead of chained replace(..., inplace=True),
# which is deprecated and may operate on a temporary copy.
df["meal"] = df["meal"].replace("Undefined", "SC")
# Some rows contain entries with 0 adults, 0 children and 0 babies.
# Drop these bookings with no guests.
# FIX: pass the index labels directly via `index=` — the original
# `df.drop(df.index[zero_guests])` re-used labels as positions, which only
# works while the RangeIndex is untouched.
zero_guests = df[df["adults"] + df["children"] + df["babies"] == 0].index
df.drop(index=zero_guests, inplace=True)
# feature engineering
# adr_pp: average daily rate per person (adults + children; babies excluded).
# NOTE(review): a row with adults == 0 and children == 0 but babies > 0 would
# divide by zero here — the zero-guest filter above only removes rows where
# all three counts are zero. Confirm no such rows remain in the data.
df["adr_pp"] = df["adr"] / (df["adults"] + df["children"])
# total_nights: total length of stay in nights (weekend + week nights).
df["total_nights"] = df["stays_in_weekend_nights"] + df["stays_in_week_nights"]
# Manually choose columns to include.
# Some columns are excluded to make the model more general and to prevent
# leakage (arrival_date_year, assigned_room_type, booking_changes,
# reservation_status, country, days_in_waiting_list, hotel).
# Including country would increase accuracy, but it may also make the model
# less general and potentially unfair.
num_features = ["lead_time", "arrival_date_week_number", "arrival_date_day_of_month",
                "stays_in_weekend_nights", "stays_in_week_nights", "total_nights",
                "adults", "children", "babies",
                "is_repeated_guest", "previous_cancellations", "previous_bookings_not_canceled",
                "agent", "company",
                "required_car_parking_spaces", "total_of_special_requests", "adr", "adr_pp"]
cat_features = ["arrival_date_month", "meal",
                "market_segment", "distribution_channel", "reserved_room_type",
                "deposit_type", "customer_type"]
# Full feature list used for modelling.
features = num_features + cat_features
# Separate features (X) and target (y).
# FIX: dropping "is_canceled" before the column selection was redundant —
# the target is not in `features`, so selecting the columns directly is
# equivalent and avoids building an intermediate DataFrame copy.
X = df[features]
y = df["is_canceled"]
# Hold out 20% of the data for testing; fixed random_state for reproducibility.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)
For simplicity, the models were trained separately in another notebook; here we only load the trained models.
# Load the XGBoost classifier trained in a separate notebook.
# FIX: use a context manager so the file handle is closed deterministically
# (the original `pickle.load(open(...))` never closed the file).
# NOTE(review): pickle deserialization can execute arbitrary code — only
# load model files from trusted sources.
import pickle

with open('models/xgb', 'rb') as model_file:
    xgb_model = pickle.load(model_file)
Let's check the influence of the explanatory variables on the model's predictions. For this purpose, we can use Partial Dependence Profiles and Accumulated Local Effects plots. First, let's create an explainer object for each trained model.
# Wrap the trained model in a dalex Explainer over the full feature matrix and target.
xgb_explainer = dx.Explainer(xgb_model, X, y)
Preparation of a new explainer is initiated -> data : 119210 rows 25 cols -> target variable : Parameter 'y' was a pandas.Series. Converted to a numpy.ndarray. -> target variable : 119210 values -> model_class : xgboost.sklearn.XGBClassifier (default) -> label : Not specified, model's class short name will be used. (default) -> predict function : <function yhat_proba_default at 0x000001329C30A0D0> will be used (default) -> predict function : Accepts only pandas.DataFrame, numpy.ndarray causes problems. -> predicted values : min = 2.54e-06, mean = 0.368, max = 1.0 -> model type : classification will be used (default) -> residual function : difference between y and yhat (default) -> residuals : min = -1.0, mean = 0.00283, max = 1.0 -> model_info : package sklearn A new explainer has been created!
# Compute Partial Dependence Profiles (default type) and Accumulated Local Effects.
pdp_xgb = xgb_explainer.model_profile()
ale_xgb = xgb_explainer.model_profile(type = 'accumulated')
# Relabel the results so the two profile sets are distinguishable on a shared plot.
pdp_xgb.result['_label_'] = 'PDP profiles'
ale_xgb.result['_label_'] = 'ALE profiles'
Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 25/25 [00:09<00:00, 2.66it/s] Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 25/25 [00:09<00:00, 2.60it/s] Calculating accumulated dependency: 100%|██████████████████████████████████████████████| 18/18 [00:02<00:00, 6.27it/s]
# Overlay the ALE profiles on the PDP plot for side-by-side comparison.
pdp_xgb.plot(ale_xgb)
As we can observe, increasing lead_time up to 30 days results in a linear increase in the probability of cancellation. We may see that changing the number of previous cancellations to any non-zero value changes the predictions completely. Also, in the case of the variables total_of_special_requests and required_car_parking_spaces, increasing their values to any value greater than zero would cause a decrease in the probability of cancellation. It is also interesting to note that the number of adults, children or babies doesn't affect the model's predictions at all. Since the PD and ALE profiles are parallel to each other, they suggest that the model is additive for the selected explanatory variables.
Considering grouped partial-dependence profiles, we may discover an interesting property of the trained model. For customers whose booking has a non-refundable deposit, our model nearly always predicts cancellation, and no explanatory variable can change that. This seems dubious and may suggest that it would be better to train a different model without this variable.
# Partial-dependence profiles computed separately for each deposit_type group.
pdp_deposit_type = xgb_explainer.model_profile(groups = 'deposit_type')
pdp_deposit_type.plot()
Calculating ceteris paribus: 100%|█████████████████████████████████████████████████████| 25/25 [00:10<00:00, 2.47it/s]